#!/usr/bin/python
# -*- coding: utf-8 -*-
# ----------------------------------------------------------------------
# clex.py
#
# A lexer for ANSI C. Ignores plain comments; captures documentation comments.
# ----------------------------------------------------------------------

import sys

#import ply.lex as lex
import lex

class clex:
    """An ANSI C lexer built on top of PLY.

    Token rules follow PLY's introspection convention (``t_*`` string
    attributes and methods).  The regex docstrings and the relative order
    of the function rules are significant to PLY, so the rules themselves
    are kept verbatim.

    Plain comments are discarded, but documentation comments are captured
    into the ``cm_map`` dict supplied to the constructor, keyed by source
    line number:

    * ``/**< ... */`` / ``/*-< ... */`` — postfix, bound to the current line
    * ``/** ... */`` / ``/*- ... */``  — prefix, bound to the next line
    """

    # Reserved words
    reserved = (
        'AUTO', 'BREAK', 'CASE', 'CHAR', 'CONST', 'CONTINUE', 'DEFAULT', 'DO', 'DOUBLE',
        'ELSE', 'ENUM', 'EXTERN', 'FLOAT', 'FOR', 'GOTO', 'IF', 'INT', 'LONG', 'REGISTER',
        'RETURN', 'SHORT', 'SIGNED', 'SIZEOF', 'STATIC', 'STRUCT', 'SWITCH', 'TYPEDEF',
        'UNION', 'UNSIGNED', 'VOID', 'VOLATILE', 'WHILE', 'CR',
        )

    tokens = reserved + (
        # Comment tokens (captured into cm_map, not emitted)
        'CPCOMMENT', 'CCOMMENT', 'NEWCOMMENT',

        # Literals (identifier, integer constant, float constant, string constant, char const)
        'ID', 'TYPEID', 'ICONST', 'HCONST', 'FCONST', 'SCONST', 'CCONST', 'ZHCN',

        # Operators (+,-,*,/,%,|,&,~,^,<<,>>, ||, &&, !, <, <=, >, >=, ==, !=)
        'PLUS', 'MINUS', 'TIMES', 'DIVIDE', 'MOD',
        'OR', 'AND', 'NOT', 'XOR', 'LSHIFT', 'RSHIFT',
        'LOR', 'LAND', 'LNOT',
        'LT', 'LE', 'GT', 'GE', 'EQ', 'NE',

        # Assignment (=, *=, /=, %=, +=, -=, <<=, >>=, &=, ^=, |=)
        'EQUALS', 'TIMESEQUAL', 'DIVEQUAL', 'MODEQUAL', 'PLUSEQUAL', 'MINUSEQUAL',
        'LSHIFTEQUAL','RSHIFTEQUAL', 'ANDEQUAL', 'XOREQUAL', 'OREQUAL',

        # Increment/decrement (++,--)
        'PLUSPLUS', 'MINUSMINUS',

        # Structure dereference (->)
        'ARROW',

        # Conditional operator (?)
        'CONDOP',

        # Delimiters ( ) [ ] { } , . ; :
        'LPAREN', 'RPAREN',
        'LBRACKET', 'RBRACKET',
        'LBRACE', 'RBRACE',
        'COMMA', 'PERIOD', 'SEMI', 'COLON',

        # Ellipsis (...)
        'ELLIPSIS',
        )

    # Completely ignored characters
    t_ignore           = ' \r\t\x0c'

    # Operators
    t_PLUS             = r'\+'
    t_MINUS            = r'-'
    t_TIMES            = r'\*'
    t_DIVIDE           = r'/'

    t_MOD              = r'%'
    t_OR               = r'\|'
    t_AND              = r'&'
    t_NOT              = r'~'
    t_XOR              = r'\^'
    t_LSHIFT           = r'<<'
    t_RSHIFT           = r'>>'
    t_LOR              = r'\|\|'
    t_LAND             = r'&&'
    t_LNOT             = r'!'
    t_LT               = r'<'
    t_GT               = r'>'
    t_LE               = r'<='
    t_GE               = r'>='
    t_EQ               = r'=='
    t_NE               = r'!='

    # Assignment operators

    t_EQUALS           = r'='
    t_TIMESEQUAL       = r'\*='
    t_DIVEQUAL         = r'/='
    t_MODEQUAL         = r'%='
    t_PLUSEQUAL        = r'\+='
    t_MINUSEQUAL       = r'-='
    t_LSHIFTEQUAL      = r'<<='
    t_RSHIFTEQUAL      = r'>>='
    t_ANDEQUAL         = r'&='
    t_OREQUAL          = r'\|='
    t_XOREQUAL         = r'^='

    # Increment/decrement
    t_PLUSPLUS         = r'\+\+'
    t_MINUSMINUS       = r'--'

    # ->
    t_ARROW            = r'->'

    # ?
    t_CONDOP           = r'\?'

    # Delimiters (PLY sorts string rules longest-first, so ELLIPSIS
    # takes precedence over PERIOD)
    t_LPAREN           = r'\('
    t_RPAREN           = r'\)'
    t_LBRACKET         = r'\['
    t_RBRACKET         = r'\]'
    t_LBRACE           = r'\{'
    t_RBRACE           = r'\}'
    t_COMMA            = r','
    t_PERIOD           = r'\.'
    t_SEMI             = r';'
    t_COLON            = r':'
    t_ELLIPSIS         = r'\.\.\.'

    # Integer literal (decimal, with optional u/l suffixes)
    t_ICONST = r'\d+([uU]|[lL]|[uU][lL]|[lL][uU])?'

    def t_HCONST(self, t):
        r'((0x)|(0X))[0-9a-fA-F]+([uU]|[lL]|[uU][lL]|[lL][uU])?'
        # Hexadecimal literal — reclassified so downstream sees one ICONST type.
        t.type = 'ICONST'
        return t

    # Floating literal
    t_FCONST = r'((\d+)(\.\d+)(e(\+|-)?(\d+))? | (\d+)e(\+|-)?(\d+))([lL]|[fF])?'

    # String literal — the surrounding quotes are stripped from the value
    def t_SCONST(self, t):
        r'\"([^\\\n]|(\\.))*?\"'
        t.type = 'SCONST'
        t.value = t.value[1:-1]
        return t

    # Character constant 'c' or L'c'
    t_CCONST = r'(L)?\'([^\\\n]|(\\.))*?\''

    # Map lower-case C keyword spelling -> upper-case token name
    reserved_map = { }
    for r in reserved:
        reserved_map[r.lower()] = r

    def __init__(self, comment, autobuild=1):
        """comment: dict shared with the caller; this lexer fills it with
        {line number: documentation-comment text}.  autobuild=1 builds the
        PLY lexer immediately."""
        self.cm_map = comment
        # Extra user keywords, loaded via load_keyword_from_file().
        self.keyword_map = {}
        if autobuild==1:
            self.build(lextab="clextab")

    def load_keyword_from_file(self, filename):
        """Load a keyword list from a file (best effort).

        Format: one keyword per line.  A ``name = TOKENTYPE`` line also
        switches the token type assigned to that and all following
        keywords.  Lines starting with ``#`` are ignored.  A missing or
        unreadable file is silently skipped.
        """
        try:
            with open(filename) as f:
                last_key = 'AUTO'
                for line in f:
                    if line[0] != '#':  # skip comment lines
                        ws = line.split('=')
                        # "key = TYPE" form updates the current token type
                        if (len(ws) >= 2):
                            last_key = ws[1].strip().upper()
                        key = ws[0].strip()
                        if (key):
                            self.keyword_map[key] = last_key
        except (IOError, OSError, UnicodeError):
            # Best effort by design: a bad keyword file is not fatal.
            pass

    # Newlines (tracked for line numbering, no token emitted)
    def t_NEWLINE(self, t):
        r'\n+'
        t.lexer.lineno += t.value.count("\n")

    # Backquoted text `...` is treated as an identifier (quotes stripped)
    def t_ZHCN(self, t):
        r'`.*?`'
        t.type = "ID"
        t.value = t.value[1:-1]
        return t

    def t_ID(self, t):
        r'[A-Za-z_][\w_]*'
        t.type = self.reserved_map.get(t.value, "ID")
        if (len(t.value) > 2
                and t.value[-1].lower() == 't'  # _t / _T suffix => type name
                and t.value[-2] == '_'):
            # NOTE(review): the suffix comment says "TYPE" but the token
            # emitted is 'INT' — confirm this is intentional.
            t.type = 'INT'
            return t
        elif (t.value.upper() == 'CR'):
            # Drop CR tokens (no return, so the token is discarded).
            pass
        elif (t.value.upper() in ('FAR', 'STATIC')):
            # FAR/STATIC: retyped to AUTO but not returned, i.e. discarded.
            t.type = 'AUTO'
        elif (t.value in self.keyword_map):  # fix: dict.has_key removed in Python 3
            t.type = self.keyword_map[t.value]
            return t
        else:
            return t

    # C++-style line comment: discarded, but keeps line numbers accurate
    def t_NEWCOMMENT(self, t):
        r'//.*?\n'
        t.lexer.lineno += 1

    # Postfix documentation comment /**< ... */ — bound to the current line
    def t_CPCOMMENT(self, t):
        r'/\*[\*|-]<(.|\n)*?\*/'
        t.lexer.lineno += t.value.count('\n')
        # Strip the opener and the trailing */.
        res = t.value[4:-2]
        self.cm_map[t.lexer.lineno] = res

    # Prefix documentation comment /** ... */ — bound to the next line
    def t_CCOMMENT(self, t):
        r'/\*[\*|-](.|\n)*?\*/'
        t.lexer.lineno += t.value.count('\n')
        # NOTE(review): the opener matched here is 3 chars ("/**" or "/*-")
        # but 4 are stripped, dropping the first content character —
        # confirm the extra character (often a space) is intended.
        res = t.value[4:-2]
        self.cm_map[t.lexer.lineno + 1] = res

    # Plain block comments: discarded
    def t_comment(self, t):
        r'/\*(.|\n)*?\*/'
        t.lexer.lineno += t.value.count('\n')

    # Preprocessor directive (ignored)
    def t_preprocessor(self, t):
        r'\#(.)*?\n'
        t.lexer.lineno += 1

    def t_error(self, t):
        # Report and skip a single illegal character.
        print("c illegal character %s @L%d" % (repr(t.value[0]), t.lexer.lineno))
        t.lexer.skip(1)

    def build(self, **kwargs):
        """Build the PLY lexer from this instance's t_* rules."""
        self.lexer = lex.lex(module=self, **kwargs)

    def test(self, data):
        """Tokenize *data* and print every token (debug helper)."""
        self.lexer.input(data)
        while True:
            # Bug fix: was the unbound name `lexer.token()` (NameError).
            tok = self.lexer.token()
            if not tok:
                break
            # Bug fix: was the Python-2-only statement `print tok`.
            print(tok)

class vlex(clex):
    """Variant of clex whose t_ID2 rule also allows '.' inside
    identifiers.  Note the alias ``t_ID = t_ID2`` is commented out, so
    t_ID2 is currently inactive and the inherited t_ID applies."""

    def __init__(self, autobuild=1):
        # Use a private empty comment map, skip the parent's automatic
        # build, then build with this class's own lex table.
        clex.__init__(self, {}, 0)
        self.keyword_map = {}
        if autobuild==1:
            self.build(lextab="vlextab")

    def build(self, **kwargs):
        """Build the PLY lexer from this instance's t_* rules."""
        self.lexer = lex.lex(module=self, **kwargs)

    def t_ID2(self, t):
        r'[A-Za-z_][\w_.]*'
        t.type = self.reserved_map.get(t.value, "ID")
        if (len(t.value) > 2
                and t.value[-1].lower() == 't'  # _t / _T suffix => type name
                and t.value[-2] == '_'):
            t.type = 'INT'
            return t
        elif (t.value.upper() == 'CR'):
            # Drop CR tokens (no return, so the token is discarded).
            pass
        elif (t.value.upper() == 'FAR'):
            # FAR: retyped to AUTO but not returned, i.e. discarded.
            t.type = 'AUTO'
        elif (t.value in self.keyword_map):  # fix: dict.has_key removed in Python 3
            t.type = self.keyword_map[t.value]
            return t
        else:
            return t
    #t_ID = t_ID2

if __name__ == "__main__":
    # Quick manual check: lex input via PLY's runmain, then dump the
    # comment map the lexer collected.
    comment_map = {}
    c_lexer = clex(comment_map)
    lex.runmain(c_lexer.lexer)
    print(comment_map)
